home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
C/C++ Users Group Library 1996 July
/
C-C++ Users Group Library July 1996.iso
/
listings
/
v_12_02
/
otto
/
bm.c
< prev
next >
Wrap
C/C++ Source or Header
|
1993-12-01
|
6KB
|
242 lines
/************************************************
* Fast String Search
*
* Using a tuned Boyer and Moore algorithm.
*
* This source code is written by Erick Otto.
* Algorithms from Daniel Sunday and Andrew Hume
* are used in this code.
*
* 1993 by Erick Otto.
*
*/
#include <stdio.h>
#include <fcntl.h>
#include "freq.h"
/* Size of text buffer, BUFSIZ */
/* is optimum from stdio.h */
#define TBUF (8*BUFSIZ)
#define MAXPAT 256
/* We use the following variables for the search */
/* */
/* pat : The pattern */
/* dist : A table which contains the offsets */
/* for every char in the pattern from */
/* the end of the pattern */
/* lfc : The least frequent char in the patt */
/* lfcoff : The offset of lfc from end of patt */
/* shift : The leftward re-occurance of the */
/* last charater in positions from the */
/* end of the patt */
int patlen;
unsigned char pat[MAXPAT];
unsigned char dist[MAXPAT];
int lfc, lfcoff;
int shift;
main(argc, argv)
int argc;
char **argv;
{
char buf[TBUF+MAXPAT];
char match[MAXPAT];
register int fd, nread;
register char *beg, *lastnl, *end;
char filename[80];
/* must be at least one arguments */
if (argc < 2 ) {
fprintf(stderr,"\n\tMatch string expected\n");
exit();
}
if (argc < 3 ) {
fprintf(stderr,"\n\tFilename expected\n");
exit();
}
strcpy(match, argv[1]);
strcpy(filename, argv[2]);
build(match, strlen(match));
if ((fd = open(filename,O_RDONLY)) == -1) {
fprintf(stderr,
"Can't open file %s\n", filename);
perror(filename);
exit(1);
}
*buf = '\n'; /* sentinel for printing purposes */
beg = buf+1;
while((nread=read(fd,beg,(&buf[TBUF]-beg)))>0){
end = beg+nread;
lastnl = end;
while(*--lastnl != '\n') ;
/* lastnl points to the newline */
matchpat(buf+1, lastnl-buf);
memcpy(buf+1, lastnl, end-lastnl-1);
beg = buf + 1 + (end-lastnl);
}
close(fd);
}
build(match, m)
unsigned char *match;
register m;
{
register unsigned char *patend, *patptr;
register unsigned char *d;
register int i;
register unsigned char *pos;
unsigned char *lastchar;
int lfcidx;
if(m > MAXPAT) {
printf("Length of the pattern too long!\n");
exit(1);
}
/* initialize the pattern variables */
patlen = m;
memcpy(pat, match, patlen);
/* build the dist table */
d = dist;
/* initialize all elements to the pattern length */
for(i = 0; i < MAXPAT; i++)
d[i] = patlen;
/* set all characters in the pattern */
/* to their relative distances from the */
/* end of the pattern */
patptr = pat;
patend = patptr+patlen-1;
while(patptr < patend) {
d[*patptr] = patend-patptr;
patptr++;
}
d[*patptr] = 0;
/* find the least frequent character */
/* that occurs in the pattern */
lfcidx = 0;
for(i = 1; i < patlen; i++){
if(freq[pat[i]] < freq[pat[lfcidx]])
lfcidx = i;
}
/* set the lf char and offset for */
/* later use */
lfc = pat[lfcidx];
lfcoff = lfcidx - (patlen-1);
/* Look backward and see if we can */
/* find an other occurance of the */
/* same character as the last one */
lastchar = patptr;
for(pos = lastchar-1; pos >= pat; pos--)
if (*pos == *lastchar) break;
/* record the occurance for later use */
shift = lastchar - pos;
}
matchpat(text, n)
unsigned char *text;
int n;
{
register unsigned char *e, *s;
register unsigned char *dp;
register int k;
register int lfco, lfcc;
register unsigned char *p, *q;
register unsigned char *ep;
register char *sp;
register char *nl;
register int t1;
char save[MAXPAT];
dp = dist;
t1 = patlen-1;
sp = save;
s = text+t1;
e = text+n;
memcpy(sp, e, patlen);
memset(e, pat[t1], patlen);
lfco = lfcoff;
lfcc = lfc;
ep = pat + t1;
while(s < e){
/* fast forward through the text */
/* untill we find the last char */
/* of the pattern (unrolled loop) */
k = dp[*s];
while(k){
k = dp[*(s += k)];
k = dp[*(s += k)];
k = dp[*(s += k)];
}
/* If we hit the sentinel at */
/* the end of the buffer we */
/* are done with the buffer */
if(s >= e)
break;
/* Neat little trick, actually */
/* makes things faster. Do a */
/* pre check on the lfc, before */
/* starting the full comparison */
if(s[lfco] != lfcc)
goto mismatch;
/* normal forward string matching */
for(p = pat, q = s-t1; p < ep; ){
if(*q++ != *p++)
goto mismatch;
}
/** A match has been found **/
/* at position q - patlen */
nl=q;
/* look back for the closest newline */
while(*--nl != '\n')
;
while(*++nl != '\n') putchar(*nl);
putchar('\n');
/* we are now at q which is somewhere in
* the text we reported a match by
* printing the line, so any possible
* other matches will NOT have to be
* searched for in this line so forward
* to the end of line
*/
q=nl;
mismatch:
/* use the shift that we calculated */
s += shift;
}
memcpy(e, sp, patlen);
return(0);
}